{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# 随机梯度下降法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "m = 100000\n",
    "\n",
    "x = np.random.normal(size=m)\n",
    "X = x.reshape(-1,1)\n",
    "y = 4.*x + 3. + np.random.normal(0,3,size=m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 损失函数\n",
    "def J(theta, X_b, y):\n",
    "    try:\n",
    "        return np.sum((y - X_b.dot(theta))**2) / len(y)\n",
    "    except:\n",
    "        return float('inf')\n",
    "    \n",
    "def derivative_J(theta:np.ndarray, X_b:np.ndarray, y:np.ndarray):\n",
    "    \"\"\"\n",
    "    求θ为给定值时的导数(梯度)\n",
    "    :param theta: \n",
    "    :param X_b: \n",
    "    :param y: \n",
    "    :return: \n",
    "    \"\"\"\n",
    "    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)\n",
    "\n",
    "# 批量梯度下降法\n",
    "def gradient_descent(X_b, y, initial_theta, eta=0.01, n_iters=1e4, epsilon=1e-8):\n",
    "    theta = initial_theta\n",
    "    cur_iter = 0\n",
    "    while cur_iter < n_iters:\n",
    "        gradient = derivative_J(theta, X_b, y)\n",
    "        last_theta = theta\n",
    "        theta = theta - eta * gradient\n",
    "        if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):\n",
    "            break\n",
    "        cur_iter += 1\n",
    "    return theta"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 批量梯度下降法效果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.36 s, sys: 93.2 ms, total: 1.45 s\nWall time: 1.45 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "X_b = np.hstack((np.ones((len(X), 1)), X))\n",
    "initial_theta = np.zeros(X_b.shape[1])\n",
    "eta = 0.01\n",
    "# 我们知道最终的系数和截距，直接肉眼比较吧。。。就不分训练集测试集了\n",
    "theta = gradient_descent(X_b,y,initial_theta, eta)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 3.01042744,  4.00071587])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "theta"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 随机梯度下降法效果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def derivative_J_sgd(theta:np.ndarray, X_b_i:np.ndarray, y_i):\n",
    "    \"\"\"\n",
    "    求随机搜索方向 \n",
    "    \"\"\"\n",
    "    return X_b_i.T.dot(X_b_i.dot(theta) - y_i) * 2."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sgd(X_b, y, initial_theta, n_iters):\n",
    "    # 两个超参数\n",
    "    t0 = 5\n",
    "    t1 = 50\n",
    "    \n",
    "    def learning_rate(t):\n",
    "        return t0/(t+t1)\n",
    "    \n",
    "    theta = initial_theta\n",
    "    for cur_iter in range(n_iters):\n",
    "        # 随机选一个\n",
    "        rand_i = np.random.randint(len(X_b))\n",
    "        gradient = derivative_J_sgd(theta, X_b[rand_i], y[rand_i])\n",
    "        # 向搜索方向的相反方向移动η\n",
    "        theta = theta - learning_rate(cur_iter) * gradient\n",
    "    return theta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 276 ms, sys: 6.11 ms, total: 282 ms\nWall time: 283 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "X_b = np.hstack((np.ones((len(X), 1)), X))\n",
    "initial_theta = np.zeros(X_b.shape[1])\n",
    "theta = sgd(X_b, y, initial_theta, n_iters=len(X_b)//3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 3.02984824,  3.9936953 ])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "theta"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "结论：批量梯度下降法和随机梯度下降法最终效果差不多，但是随机梯度下降法循环次数少得多，计算时间快得多"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
